############################################################
# Analyze self-reported behaviors from past week 
# (these are pre-treatment outcomes, noting that info treatments
# had precisely null effects).
# 
# Author: 
# Soubhik Barari
# 
# Environment:
# - must use R 3.6
# 
# Input:
# - 00-read_clean_data.R
#
# Output:
# - fig/behaviors_past_week{.png, .pdf}
# - fig/behavior_x_group{.png, .pdf}
# - fig/behavior_overall_x_group{.png, .pdf}
# - fig/behavior_predictors{.png,.pdf}
############################################################

source("00-read_clean_data.R")

#----------------------------------------------------------
# density of scores
#----------------------------------------------------------

score_viz_df <- responses_df %>% 
	tidyr::gather("behavior", "value", c("SelfReported_Behavio_1", "SelfReported_Behavio_2", "SelfReported_Behavio_3", "SelfReported_Behavio_4", "SelfReported_Behavio_5")) %>%
	mutate(value=as.numeric(value)) %>%
    group_by(behavior) %>%
    filter(value != 0) %>% ##these are almost entirely unanswered, not actual zeroes
    mutate(avg=mean(value, na.rm=T)) %>%
    ungroup() %>%
	mutate(behavior = 
		ifelse(behavior == "SelfReported_Behavio_1", sprintf("Stayed\nat home"),
		ifelse(behavior == "SelfReported_Behavio_2", sprintf("Avoided social\ngatherings"),
		ifelse(behavior == "SelfReported_Behavio_3", sprintf("Washed hands\nmore frequently"),
		ifelse(behavior == "SelfReported_Behavio_4", sprintf("Kept social distance\n of 2m from others"),
		ifelse(behavior == "SelfReported_Behavio_5", sprintf("Inform others\nof any symptoms", avg), "")
	))))) %>%
    arrange(avg)
score_viz_df$behavior <- factor(score_viz_df$behavior, levels = unique(score_viz_df$behavior))
        

ggplot(score_viz_df) + ylab("density") + xlab(sprintf("extent of engagement in behavior last week (n=%i)", nrow(responses_df))) + 
	geom_density(aes(x=value, y=..scaled..), fill="gray32", colour="gray32") + 
	facet_grid(. ~ behavior, scales = "free") +
    geom_segment(data = score_viz_df %>% group_by(behavior) %>% summarise(v=head(avg,1)), aes(x=v, xend=v, y=0, yend=0.9), colour="black", lty=2) +
    geom_text(data = score_viz_df %>% group_by(behavior) %>% summarise(v=head(avg,1)), aes(label=sprintf("%0.1f", v), x=v-5, y=0.93)) +
    theme_bw()
ggsave("fig/behaviors_past_week.pdf", height=3, width=8)
ggsave("fig/behaviors_past_week.png", height=3, width=8)

# responses_df %>%
#     ggplot(aes(x=past_behavior_index)) +
#     geom_density(aes(y=..scaled..), fill="gray32", colour="gray32") + 
#     ylab("density") + 
#     xlab(sprintf("healthy behavior score (out of 100, n=%d)", nrow(responses_df))) +
#     geom_vline(xintercept=mean(responses_df$past_behavior_index, na.rm=T), colour="red") +
#     theme_bw()
# ggsave("fig/behavior_score_past_week.pdf", height=4, width=6)
# ggsave("fig/behavior_score_past_week.png", height=4, width=6)

#----------------------------------------------------------
# descriptive plot (individual behavior x demographic)
#----------------------------------------------------------

b_long_df <- responses_df %>%
    tidyr::gather("behavior", "value", c("SelfReported_Behavio_1", "SelfReported_Behavio_2", "SelfReported_Behavio_3", "SelfReported_Behavio_4", "SelfReported_Behavio_5")) %>%
    mutate(value=as.numeric(value)) %>%
    filter(value != 0) %>% ##these are almost entirely unanswered, not actual zeroes
    mutate(behavior = 
               ifelse(behavior == "SelfReported_Behavio_1", "Stayed at home",
                      ifelse(behavior == "SelfReported_Behavio_2", "Avoided social\ngatherings",
                             ifelse(behavior == "SelfReported_Behavio_3", "Washed hands\nmore frequently",
                                    ifelse(behavior == "SelfReported_Behavio_4", "Kept social distance\n of 2m from others",
                                           ifelse(behavior == "SelfReported_Behavio_5", "Inform others\nof any symptoms", "")
                                    )))))
b_long_df$behavior <- factor(b_long_df$behavior, 
                             levels = c(
                                 "Kept social distance\n of 2m from others",
                                 "Washed hands\nmore frequently",
                                 "Stayed at home",
                                 "Avoided social\ngatherings",
                                 "Inform others\nof any symptoms"
                             ))

library(scales)
b_long_df %>% 
    filter(gender != "Other") %>%
    tidyr::gather("variable", "variable_value", c("gender", "health", "age_group")) %>%
    select(variable, variable_value, behavior, value) %>%
    mutate(variable = 
               ifelse(variable == "age_group", "age group",
                      ifelse(variable == "health", "general health",
                             ifelse(variable == "past_healthy_behavior", "past healthy\nbehavior", "gender"
                             )))) %>%
    mutate(variable_value = factor(variable_value, levels=c(
        "18-29", "30-39", "40-49", "50-59", "60+", "Poor", "Fair", "Excellent", "Male", "Female", "Other"
    ))) %>%
    group_by(variable, variable_value, behavior) %>%
    summarise(
        y = mean(value, na.rm=T),
        ymin = mean(value, na.rm=T)-(1.96*sd(value, na.rm=T)/sqrt(n())),
        ymax = mean(value, na.rm=T)+(1.96*sd(value, na.rm=T)/sqrt(n()))
    ) %>%
    filter(!is.na(variable_value) & variable_value != "") %>%
    ggplot(aes(x=variable_value, y=y)) +
    geom_text(aes(label=sprintf("%0.1f", y), x=variable_value, y=ymax+3.8), size=3) +
    geom_bar(stat="identity", position = position_dodge(width=0.9)) +
    geom_errorbar(aes(ymin=ymin, ymax=ymax), position = position_dodge(width=0.9), width = 0, size=1) +
    coord_flip() +
    scale_y_continuous(limits=c(78, 103),oob = rescale_none) +
    facet_grid(variable ~ behavior, scales = "free", space = "free") +
    # facet_grid(variable ~ behavior) +
    xlab("") + ylab("average self-reported grade") +
    theme_bw()
ggsave("fig/behavior_x_group.pdf", width=8, height=4)
ggsave("fig/behavior_x_group.png", width=8, height=4)

#----------------------------------------------------------
# descriptive plot (overall behavior x demographic)
#----------------------------------------------------------

b_long_df %>% 
    tidyr::gather("variable", "variable_value", c("gender", "health", "age_group")) %>%
    select(variable, variable_value, past_behavior_index) %>%
    mutate(variable = 
               ifelse(variable == "age_group", "age group",
                      ifelse(variable == "health", "general health",
                             ifelse(variable == "past_healthy_behavior", "past healthy\nbehavior", "gender"
                             )))) %>%
    mutate(variable_value = factor(variable_value, levels=c(
        "18-29", "30-39", "40-49", "50-59", "60+", "Poor", "Fair", "Excellent", "Male", "Female", "Other"
    ))) %>%
    group_by(variable, variable_value) %>%
    summarise(
        y = mean(past_behavior_index, na.rm=T),
        ymin = mean(past_behavior_index, na.rm=T)-(sd(past_behavior_index, na.rm=T)/sqrt(5)),
        ymax = mean(past_behavior_index, na.rm=T)+(sd(past_behavior_index, na.rm=T)/sqrt(5))
    ) %>%
    filter(!is.na(variable_value) & variable_value != "") %>%
    ggplot(aes(x=variable_value, y=y)) +
    geom_bar(stat="identity", position = position_dodge(width=0.9)) +
    # geom_errorbar(aes(ymin=ymin, ymax=ymax), position = position_dodge(width=0.9), width = 0.25) +
    coord_flip() +
    facet_grid(variable ~., scales = "free", space = "free") +
    xlab("") + ylab("average self-reported grade") +
    theme_bw()
ggsave("fig/behavior_overall_x_group.pdf", width=5, height=4)
ggsave("fig/behavior_overall_x_group.png", width=5, height=4)

#----------------------------------------------------------
# predictors of individual behaviors
#----------------------------------------------------------

beh_coef_df <- data.frame()

beh <- lm(SelfReported_Behavio_1 ~ age_group + health + gender, responses_df)
summary(beh)
beh_coef <- summary(beh)$coefficients
beh_coef_df <- rbind(beh_coef_df, data.frame(
    y=beh_coef[,1], 
    variable=rownames(beh_coef),
    ymin=beh_coef[,1]-beh_coef[,2],
    ymax=beh_coef[,1]+beh_coef[,2],
    behavior="Stayed at home"
))

beh <- lm(SelfReported_Behavio_2 ~ age_group + health + gender, responses_df)
summary(beh)
beh_coef <- summary(beh)$coefficients
beh_coef_df <- rbind(beh_coef_df, data.frame(
    y=beh_coef[,1], 
    variable=rownames(beh_coef),
    ymin=beh_coef[,1]-beh_coef[,2],
    ymax=beh_coef[,1]+beh_coef[,2],
    behavior="Avoided social\ngatherings"
))

beh <- lm(SelfReported_Behavio_3 ~ age_group + health + gender, responses_df)
summary(beh)
beh_coef <- summary(beh)$coefficients
beh_coef_df <- rbind(beh_coef_df, data.frame(
    y=beh_coef[,1], 
    variable=rownames(beh_coef),
    ymin=beh_coef[,1]-beh_coef[,2],
    ymax=beh_coef[,1]+beh_coef[,2],
    behavior="Washed hands\nmore frequently"
))

beh <- lm(SelfReported_Behavio_4 ~ age_group + health + gender, responses_df)
summary(beh)
beh_coef <- summary(beh)$coefficients
beh_coef_df <- rbind(beh_coef_df, data.frame(
    y=beh_coef[,1], 
    variable=rownames(beh_coef),
    ymin=beh_coef[,1]-beh_coef[,2],
    ymax=beh_coef[,1]+beh_coef[,2],
    behavior="Inform others\nof any symptoms"
))

beh <- lm(SelfReported_Behavio_5 ~ age_group + health + gender, responses_df)
summary(beh)
beh_coef <- summary(beh)$coefficients
beh_coef_df <- rbind(beh_coef_df, data.frame(
    y=beh_coef[,1], 
    variable=rownames(beh_coef),
    ymin=beh_coef[,1]-beh_coef[,2],
    ymax=beh_coef[,1]+beh_coef[,2],
    behavior="Kept social distance\n of 2m from others"
))

levels(beh_coef_df$variable) <- c(
    "(Intercept)", "age_group30-39", "age_group40-49", "age_group50-59",
    "age_group60+", "genderFemale", "genderOther", "healthExcellent",
    "healthGood", "healthFair"
)
beh_coef_df %>%
    mutate(y=y, ymin=ymin, ymax=ymax) %>%
    mutate(sig=factor(ifelse(ymin < 0 & ymax > 0, 0, 1))) %>%
    filter(variable != "(Intercept)") %>%
    filter(variable != "genderOther") %>%
    ggplot(aes(x=variable, y=y, ymin=ymin, ymax=ymax)) +
    geom_point(aes(colour=sig)) +
    geom_pointrange(aes(colour=sig)) +
    scale_colour_manual(values=c("grey","black")) +
    facet_wrap(.~ behavior) +
    coord_flip() +
    scale_x_discrete(labels = rev(c(
        "has fair overall health",
        "has good overall health",
        "has excellent overall health",
        "female",
        ">60 years old",
        "50-59 years old",
        "40-49 years old",
        "30-39 years old"
    ))) +
    geom_hline(yintercept=0, colour="red") +
    ylab("effect on past week's\nhealthy behavior (out of 100)") +
    theme_bw() + 
    theme(legend.position = "none")
ggsave("fig/behavior_indiv_predictors.pdf", height=5, width=8)
ggsave("fig/behavior_indiv_predictors.png", height=5, width=8)

#----------------------------------------------------------
# predictors of overall behavior
#----------------------------------------------------------

beh <- lm(past_behavior_index ~ age_group + health + gender, responses_df)
summary(beh)
# Call:
#     lm(formula = past_behavior_index ~ age_group + health + gender, 
#        data = responses_df)
# 
# Residuals:
#     Min      1Q  Median      3Q     Max 
# -93.121  -4.608   5.366   9.089  17.367 
# 
# Coefficients:
#     Estimate Std. Error t value Pr(>|t|)    
#     (Intercept)      82.4701     1.8373  44.886  < 2e-16 ***
#     age_group30-39    2.4495     0.7909   3.097  0.00197 ** 
#     age_group40-49    2.9309     0.7755   3.779  0.00016 ***
#     age_group50-59    3.8340     0.8336   4.599 4.40e-06 ***
#     age_group60+      5.5365     0.9342   5.927 3.41e-09 ***
#     healthFair        0.1632     1.7953   0.091  0.92756    
#     healthGood        2.9041     1.7400   1.669  0.09521 .  
#     healthExcellent   3.8849     1.8000   2.158  0.03098 *  
#     genderFemale      2.4108     0.5092   4.734 2.29e-06 ***
#     genderOther       2.0248     4.1873   0.484  0.62874    
#     ---
#     Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# Residual standard error: 14.44 on 3263 degrees of freedom
# (179 observations deleted due to missingness)
# Multiple R-squared:  0.02109,	Adjusted R-squared:  0.01839 
# F-statistic: 7.811 on 9 and 3263 DF,  p-value: 1.804e-11

## assess model fit via LRT
beh.sat1 <- lm(past_behavior_index ~ age_group + health + gender + age_group:health, responses_df)
beh.sat2 <- lm(past_behavior_index ~ age_group + health + gender + age_group:health + gender:age_group, responses_df)
beh.sat3 <- lm(past_behavior_index ~ age_group + health + gender + age_group:health + gender:age_group + gender:health, responses_df)
anova(beh, beh.sat1, beh.sat2, beh.sat3, test="Chisq")
# Res.Df    RSS     Df Sum of Sq Pr(>Chi)  
# 1   3238  672726                         
# 2   3251  677474 -13   -4748.1  0.04346 *
# 3   3244  674856   7    2618.2  0.08242 .
# 4   3238  672726   6    2129.9  0.11444  
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1



beh_coef <- summary(beh)$coefficients
beh_coef_df <- data.frame(
    y=beh_coef[,1], 
    variable=rownames(beh_coef),
    ymin=beh_coef[,1]-beh_coef[,2],
    ymax=beh_coef[,1]+beh_coef[,2]
)

levels(beh_coef_df$variable) <- c(
    "(Intercept)", "age_group30-39", "age_group40-49", "age_group50-59",
    "age_group60+", "genderFemale", "genderOther", "healthExcellent",
    "healthGood", "healthFair"
)
beh_coef_df %>%
    mutate(y=y*100, ymin=ymin*100, ymax=ymax*100) %>%
    mutate(sig=factor(ifelse(ymin < 0 & ymax > 0, 0, 1))) %>%
    filter(variable != "(Intercept)") %>%
    filter(variable != "genderOther") %>%
    ggplot(aes(x=variable, y=y, ymin=ymin, ymax=ymax)) +
    geom_point(aes(colour=sig)) +
    geom_pointrange(aes(colour=sig)) +
    scale_colour_manual(values=c("grey","black")) +
    coord_flip() +
    scale_x_discrete(labels = rev(c(
        "has fair overall health",
        "has good overall health",
        "has excellent overall health",
        "female",
        ">60 years old",
        "50-59 years old",
        "40-49 years old",
        "30-39 years old"
    ))) +
    geom_hline(yintercept=0, colour="red") +
    ylab("effect on past week's\noverall healthy behavior (out of 100)") +
    theme_bw() + 
    theme(legend.position = "none")
ggsave("fig/behavior_predictors.pdf", height=4, width=8)
ggsave("fig/behavior_predictors.png", height=4, width=8)
